1 Setup

suppressPackageStartupMessages({
    library(data.table)
    library(DESeq2)
    library(gplots)
    library(here)
    library(hyperSpec)
    library(RColorBrewer)
    library(tidyverse)
    library(VennDiagram)
})
suppressMessages({
    source(here("UPSCb-common/src/R/featureSelection.R"))
    source(here("UPSCb-common/src/R/plotMA.R"))
    source(here("UPSCb-common/src/R/volcanoPlot.R"))
    source(here("UPSCb-common/src/R/gopher.R"))
})
pal=brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
  1. plot specific gene expression
"line_plot" <- function(dds=dds,vst=vst,gene_id=gene_id){
    message(paste("Plotting",gene_id))
    sel <- grepl(gene_id,rownames(vst))
    stopifnot(sum(sel)==1)

    p <- ggplot(bind_cols(as.data.frame(colData(dds)),
                          data.frame(value=vst[sel,])),
                aes(x=Time,y=value,col=Time,group=Time)) +
        geom_point() + geom_smooth() +
        scale_y_continuous(name="VST expression") + 
        ggtitle(label=paste("Expression for: ",gene_id))
    
    suppressMessages(suppressWarnings(plot(p)))
    return(NULL)
}
  1. extract the DE results. Default cutoffs are from Schurch et al., RNA, 2016
"extract_results" <- function(dds,vst,contrast,
                              padj=0.01,lfc=0.5,
                              plot=TRUE,verbose=TRUE,
                              export=TRUE,default_dir=here("data/analysis/DE"),
                              default_prefix="DE-",
                              labels=colnames(dds),
                              sample_sel=1:ncol(dds),
                              expression_cutoff=0,
                              debug=FALSE){
    
    if(length(contrast)==1){
        res <- results(dds,name=contrast)
    } else {
        res <- results(dds,contrast=contrast)
    }
    
    if(plot){
        par(mar=c(5,5,5,5))
        volcanoPlot(res)
        par(mar=mar)
    }
    
    # a look at independent filtering
    if(plot){
        plot(metadata(res)$filterNumRej,
             type="b", ylab="number of rejections",
             xlab="quantiles of filter")
        lines(metadata(res)$lo.fit, col="red")
        abline(v=metadata(res)$filterTheta)
    }
    
    if(verbose){
        message(sprintf("The independent filtering cutoff is %s, removing %s of the data",
                        round(metadata(res)$filterThreshold,digits=5),
                        names(metadata(res)$filterThreshold)))
    
        max.theta <- metadata(res)$filterNumRej[which.max(metadata(res)$filterNumRej$numRej),"theta"]
        message(sprintf("The independent filtering maximises for %s %% of the data, corresponding to a base mean expression of %s (library-size normalised read)",
                        round(max.theta*100,digits=5),
                        round(quantile(counts(dds,normalized=TRUE),probs=max.theta),digits=5)))
    }
    
    if(plot){
        qtl.exp=quantile(counts(dds,normalized=TRUE),probs=metadata(res)$filterNumRej$theta)
        dat <- data.frame(thetas=metadata(res)$filterNumRej$theta,
                          qtl.exp=qtl.exp,
                          number.degs=sapply(lapply(qtl.exp,function(qe){
                              res$padj <= padj & abs(res$log2FoldChange) >= lfc & 
                                  ! is.na(res$padj) & res$baseMean >= qe
                          }),sum))
        
        plot(ggplot(dat,aes(x=thetas,y=qtl.exp)) + 
            geom_line() + geom_point() +
            scale_x_continuous("quantiles of expression") + 
            scale_y_continuous("base mean expression") +
            geom_hline(yintercept=expression_cutoff,
                       linetype="dotted",col="red"))
        
        if(debug){
            p <- ggplot(dat,aes(x=thetas,y=qtl.exp)) + 
                geom_line() + geom_point() +
                scale_x_continuous("quantiles of expression") + 
                scale_y_log10("base mean expression") + 
                geom_hline(yintercept=expression_cutoff,
                           linetype="dotted",col="red")
            suppressMessages(suppressWarnings(plot(p)))
            
            plot(ggplot(dat,aes(x=thetas,y=number.degs)) + 
                     geom_line() + geom_point() +
                     geom_hline(yintercept=dat$number.degs[1],linetype="dashed") +
                     scale_x_continuous("quantiles of expression") + 
                     scale_y_continuous("Number of DE genes"))
            
            plot(ggplot(dat,aes(x=thetas,y=number.degs[1] - number.degs),aes()) + 
                     geom_line() + geom_point() +
                     scale_x_continuous("quantiles of expression") + 
                     scale_y_continuous("Cumulative number of DE genes"))
            
            plot(ggplot(data.frame(x=dat$thetas[-1],
                                   y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) + 
                     geom_line() + geom_point() +
                     scale_x_continuous("quantiles of expression") + 
                     scale_y_continuous("Number of DE genes per interval"))
            
            plot(ggplot(data.frame(x=dat$qtl.exp[-1],
                                   y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) + 
                     geom_line() + geom_point() +
                     scale_x_continuous("base mean of expression") + 
                     scale_y_continuous("Number of DE genes per interval"))
        }
        p <- ggplot(data.frame(x=dat$qtl.exp[-1],
                          y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) + 
            geom_line() + geom_point() +
            scale_x_log10("base mean of expression") + 
            scale_y_continuous("Number of DE genes per interval") + 
            geom_vline(xintercept=expression_cutoff,
                       linetype="dotted",col="red")
        suppressMessages(suppressWarnings(plot(p)))
    }
    
    sel <- res$padj <= padj & abs(res$log2FoldChange) >= lfc & ! is.na(res$padj) & 
        res$baseMean >= expression_cutoff
    
    if(verbose){
        message(sprintf("There are %s genes that are DE with the following parameters: FDR <= %s, |log2FC| >= %s, base mean expression > %s",
                        sum(sel),
                        lfc,padj,expression_cutoff))
    }
    
    val <- rowSums(vst[sel,sample_sel])==0
    if (sum(val) >0){
        warning(sprintf("There are %s DE genes that have no vst expression in the selected samples",sum(val)))
        sel[sel][val] <- FALSE
    }    
        
    if(export){
        if(!dir.exists(default_dir)){
            dir.create(default_dir,showWarnings=FALSE,recursive=TRUE,mode="0771")
        }
        write.csv(res,file=file.path(default_dir,paste0(default_prefix,"results.csv")))
        write.csv(res[sel,],file.path(default_dir,paste0(default_prefix,"genes.csv")))
    }
    if(plot){
        heatmap.2(t(scale(t(vst[sel,sample_sel]))),
                  distfun = pearson.dist,
                  hclustfun = function(X){hclust(X,method="ward.D2")},
                  trace="none",col=hpal,labRow = FALSE,
                  labCol=labels[sample_sel]
        )
    }
    return(list(all=rownames(res[sel,]),
                up=rownames(res[sel & res$log2FoldChange > 0,]),
                dn=rownames(res[sel & res$log2FoldChange < 0,])))
}
load(here("data/analysis/salmon/dds-sample-swap-corrected.rda"))

1.1 Normalisation for visualisation

vsd <- varianceStabilizingTransformation(dds,blind=FALSE)
vst <- assay(vsd)
vst <- vst - min(vst)
dir.create(here("data/analysis/DE"),showWarnings=FALSE)
save(vst,file=here("data/analysis/DE/vst-aware.rda"))

1.2 Gene of interests

We do not have any goi at that time

# goi <- read_lines(here("doc/goi.txt"))
# stopifnot(all(goi %in% rownames(vst)))
# dev.null <- lapply(goi,line_plot,dds=dds,vst=vst)

1.3 Differential Expression

dds <- DESeq(dds)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
  • Dispersion estimation The dispersion estimation is adequate
plotDispEsts(dds)

Check the different contrasts

resultsNames(dds)
## [1] "Intercept"          "Time_120hrs_vs_std" "Time_12hrs_vs_std" 
## [4] "Time_24hrs_vs_std"  "Time_4hrs_vs_std"   "Time_60min_vs_std" 
## [7] "Time_72hrs_vs_std"

1.4 Results

# fail
T1vsCtl <- extract_results(dds=dds,vst=vst,
                           contrast="Time_60min_vs_std",
                           default_prefix="DE-Time-1h-vs-std",
                           labels=paste(dds$Time,dds$Replicate,sep="-"),
                           sample_sel=dds$Time %in% c("60min","std"))
## Loading required package: LSD

## The independent filtering cutoff is 0.96719, removing 15.61273% of the data
## The independent filtering maximises for 19.30516 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)

## There are 10728 genes that are DE with the following parameters: FDR <= 0.5, |log2FC| >= 0.01, base mean expression > 0
## Warning in extract_results(dds = dds, vst = vst, contrast =
## "Time_60min_vs_std", : There are 53 DE genes that have no vst expression in the
## selected samples

T4vs1 <- extract_results(dds=dds,vst=vst,
                         contrast=list("Time_4hrs_vs_std","Time_60min_vs_std"),
                         default_prefix="DE-Time-4h-vs-1h",
                         labels=paste(dds$Time,dds$Replicate,sep="-"),
                         sample_sel=dds$Time %in% c("4hrs","60min"))

## The independent filtering cutoff is 1.24531, removing 17.45895% of the data
## The independent filtering maximises for 21.15138 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)

## There are 10962 genes that are DE with the following parameters: FDR <= 0.5, |log2FC| >= 0.01, base mean expression > 0

T12vs4 <- extract_results(dds=dds,vst=vst,
                          contrast=list("Time_12hrs_vs_std","Time_4hrs_vs_std"),
                          default_prefix="DE-Time-12h-vs-4h",
                          labels=paste(dds$Time,dds$Replicate,sep="-"),
                          sample_sel=dds$Time %in% c("12hrs","4hrs"))

## The independent filtering cutoff is 1.57275, removing 19.30516% of the data
## The independent filtering maximises for 22.9976 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)

## There are 9583 genes that are DE with the following parameters: FDR <= 0.5, |log2FC| >= 0.01, base mean expression > 0

T24vs12 <- extract_results(dds=dds,vst=vst,contrast=list("Time_24hrs_vs_std","Time_12hrs_vs_std"),
                           default_prefix="DE-Time-24h-vs-12h",
                           labels=paste(dds$Time,dds$Replicate,sep="-"),
                           sample_sel=dds$Time %in% c("24hrs","12hrs"))

## The independent filtering cutoff is 1.24531, removing 17.45895% of the data
## The independent filtering maximises for 21.15138 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)

## There are 9701 genes that are DE with the following parameters: FDR <= 0.5, |log2FC| >= 0.01, base mean expression > 0

T72vs24 <- extract_results(dds=dds,vst=vst,contrast=list("Time_72hrs_vs_std","Time_24hrs_vs_std"),
                           default_prefix="DE-Time-72h-vs-24h",
                           labels=paste(dds$Time,dds$Replicate,sep="-"),
                           sample_sel=dds$Time %in% c("72hrs","24hrs"))

## The independent filtering cutoff is 1.24531, removing 17.45895% of the data
## The independent filtering maximises for 21.15138 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)

## There are 4944 genes that are DE with the following parameters: FDR <= 0.5, |log2FC| >= 0.01, base mean expression > 0

T120vs72 <- extract_results(dds=dds,vst=vst,contrast=list("Time_120hrs_vs_std","Time_72hrs_vs_std"),
                            default_prefix="DE-Time-120h-vs-72h",
                            labels=paste(dds$Time,dds$Replicate,sep="-"),
                            sample_sel=dds$Time %in% c("120hrs","72hrs"))

## The independent filtering cutoff is 2.36286, removing 22.9976% of the data
## The independent filtering maximises for 34.07489 % of the data, corresponding to a base mean expression of 1.82808 (library-size normalised read)

## There are 534 genes that are DE with the following parameters: FDR <= 0.5, |log2FC| >= 0.01, base mean expression > 0

1.4.1 Venn Diagram

res.list <- list(T1vsCtl=T1vsCtl,
                 T4vs1=T4vs1,
                 T12vs4=T12vs4,
                 T24vs12=T24vs12,
                 T72vs24=T72vs24,
                 T120vs72=T120vs72)

1.4.1.1 Fast response

grid.newpage()
grid.draw(venn.diagram(lapply(res.list[1:2],"[[","all"),
                       NULL,
                       fill=pal[1:2]))

grid.newpage()
grid.draw(venn.diagram(lapply(res.list[1:2],"[[","up"),
                       NULL,
                       fill=pal[1:2]))

grid.newpage()
grid.draw(venn.diagram(lapply(res.list[1:2],"[[","dn"),
                       NULL,
                       fill=pal[1:2]))

1.4.1.2 Switch

grid.newpage()
grid.draw(venn.diagram(lapply(res.list[2:4],"[[","all"),
                       NULL,
                       fill=pal[1:3]))

grid.newpage()
grid.draw(venn.diagram(lapply(res.list[2:4],"[[","up"),
                       NULL,
                       fill=pal[1:3]))

grid.newpage()
grid.draw(venn.diagram(lapply(res.list[2:4],"[[","dn"),
                       NULL,
                       fill=pal[1:3]))

1.4.1.3 Acclimatization

grid.newpage()
grid.draw(venn.diagram(lapply(res.list[4:6],"[[","all"),
                       NULL,
                       fill=pal[1:3]))

grid.newpage()
grid.draw(venn.diagram(lapply(res.list[4:6],"[[","up"),
                       NULL,
                       fill=pal[1:3]))

grid.newpage()
grid.draw(venn.diagram(lapply(res.list[4:6],"[[","dn"),
                       NULL,
                       fill=pal[1:3]))

1.4.2 Gene Ontology enrichment

background <- rownames(vst)[featureSelect(vst,dds$Time,exp=0.1)]

enr.list <- lapply(res.list,function(r){
    lapply(r,gopher,background=background,task="go",url="algae")
})
## Loading required package: jsonlite
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
## No enrichments found in task: go
dev.null <- lapply(names(enr.list),function(n){
    r <- enr.list[[n]]
    if(!is.null(r$all$go)){
        write_delim(r$all$go,path=file.path(file.path(here("data/analysis/DE",
                                                           paste0(n,"-all-DE-genes_GO-enrichment.txt")))))
        write_delim(r$all$go[,c("id","padj")],path=file.path(file.path(here("data/analysis/DE",
                                                                            paste0(n,"-all-DE-genes_GO-enrichment_for-REVIGO.txt")))))
    }
    if(!is.null(r$up$go)){
        write_csv(r$up$go,path=file.path(file.path(here("data/analysis/DE",
                                                        paste0(n,"-up-DE-genes_GO-enrichment.txt")))))
        write_delim(r$up$go[,c("id","padj")],path=file.path(file.path(here("data/analysis/DE",
                                                                           paste0(n,"-up-DE-genes_GO-enrichment_for-REVIGO.txt")))))    
    }
    if(!is.null(r$dn$go)){
        write_csv(r$dn$go,path=file.path(file.path(here("data/analysis/DE",
                                                        paste0(n,"-down-DE-genes_GO-enrichment.txt")))))
        write_delim(r$dn$go[,c("id","padj")],path=file.path(file.path(here("data/analysis/DE",
                                                                           paste0(n,"-down-DE-genes_GO-enrichment_for-REVIGO.txt")))))    
    }
})

2 Session Info

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] jsonlite_1.6                LSD_4.0-0                  
##  [3] limma_3.42.0                VennDiagram_1.6.20         
##  [5] futile.logger_1.4.3         forcats_0.4.0              
##  [7] stringr_1.4.0               dplyr_0.8.3                
##  [9] purrr_0.3.3                 readr_1.3.1                
## [11] tidyr_1.0.0                 tibble_2.1.3               
## [13] tidyverse_1.3.0             RColorBrewer_1.1-2         
## [15] hyperSpec_0.99-20180627     ggplot2_3.2.1              
## [17] lattice_0.20-38             here_0.1                   
## [19] gplots_3.0.1.1              DESeq2_1.26.0              
## [21] SummarizedExperiment_1.16.0 DelayedArray_0.12.0        
## [23] BiocParallel_1.20.0         matrixStats_0.55.0         
## [25] Biobase_2.46.0              GenomicRanges_1.38.0       
## [27] GenomeInfoDb_1.22.0         IRanges_2.20.1             
## [29] S4Vectors_0.24.0            BiocGenerics_0.32.0        
## [31] data.table_1.12.6          
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.4-1       rprojroot_1.3-2        htmlTable_1.13.2      
##  [4] XVector_0.26.0         base64enc_0.1-3        fs_1.3.1              
##  [7] rstudioapi_0.10        farver_2.0.1           bit64_0.9-7           
## [10] AnnotationDbi_1.48.0   lubridate_1.7.4        xml2_1.2.2            
## [13] splines_3.6.1          geneplotter_1.64.0     knitr_1.26            
## [16] zeallot_0.1.0          Formula_1.2-3          broom_0.5.2           
## [19] annotate_1.64.0        cluster_2.1.0          dbplyr_1.4.2          
## [22] compiler_3.6.1         httr_1.4.1             backports_1.1.5       
## [25] assertthat_0.2.1       Matrix_1.2-18          lazyeval_0.2.2        
## [28] cli_1.1.0              formatR_1.7            acepack_1.4.1         
## [31] htmltools_0.4.0        tools_3.6.1            gtable_0.3.0          
## [34] glue_1.3.1             GenomeInfoDbData_1.2.2 Rcpp_1.0.3            
## [37] cellranger_1.1.0       vctrs_0.2.0            gdata_2.18.0          
## [40] nlme_3.1-142           xfun_0.11              testthat_2.3.0        
## [43] rvest_0.3.5            lifecycle_0.1.0        gtools_3.8.1          
## [46] XML_3.98-1.20          zlibbioc_1.32.0        scales_1.1.0          
## [49] hms_0.5.2              lambda.r_1.2.4         curl_4.2              
## [52] yaml_2.2.0             memoise_1.1.0          gridExtra_2.3         
## [55] rpart_4.1-15           latticeExtra_0.6-28    stringi_1.4.3         
## [58] RSQLite_2.1.2          highr_0.8              genefilter_1.68.0     
## [61] checkmate_1.9.4        caTools_1.17.1.2       rlang_0.4.2           
## [64] pkgconfig_2.0.3        bitops_1.0-6           evaluate_0.14         
## [67] labeling_0.3           htmlwidgets_1.5.1      bit_1.1-14            
## [70] tidyselect_0.2.5       magrittr_1.5           R6_2.4.1              
## [73] generics_0.0.2         Hmisc_4.3-0            DBI_1.0.0             
## [76] pillar_1.4.2           haven_2.2.0            foreign_0.8-72        
## [79] withr_2.1.2            survival_3.1-7         RCurl_1.95-4.12       
## [82] nnet_7.3-12            modelr_0.1.5           crayon_1.3.4          
## [85] futile.options_1.0.1   KernSmooth_2.23-16     rmarkdown_1.18        
## [88] locfit_1.5-9.1         readxl_1.3.1           blob_1.2.0            
## [91] reprex_0.3.0           digest_0.6.23          xtable_1.8-4          
## [94] munsell_0.5.0